#### libraries
library(tm)
library(xtable)

#### set working directory to folder containing data before executing

# Load the open-ended poll workspace into the current environment.
# NOTE(review): presumably this file supplies the dt1a.* ... dt4a.* fav/unfav
# matrices used below -- confirm, since load() creates whatever objects the
# file contains and silently overwrites same-named objects.
load("openendpollwordslowthr121611.Rdata")

#### LOAD PRESS RELEASE DATA
# NOTE(review): JSS_dtm3 is expected to come from this file -- confirm.
load("pressrelease070511.Rdata")
# Keep the press-release matrix under a shorter working name.
dta.pr <- JSS_dtm3

# Drop the original binding so only dta.pr refers to the press-release data.
rm(JSS_dtm3)

#################

# Terms (columns) shared by the open-ended matrices and the press releases.
#
# idx.overlap.oe.pr: positions, within dt1a.fav, of the terms that also occur
# in dta.pr (kept in dt1a.fav's column order).
idx.overlap.oe.pr <- which(colnames(dt1a.fav) %in% colnames(dta.pr))

# idx.overlap.pr: positions, within dta.pr, of exactly those terms and in the
# same order.  Using match() on the selected names (instead of a second,
# independent which(... %in% ...)) guarantees that column k of the two subsets
# is the same term even when the two vocabularies are ordered differently;
# the element-wise comparisons further down rely on that alignment.  When the
# orders already agree and names are unique the result is unchanged.
idx.overlap.pr <- match(colnames(dt1a.fav)[idx.overlap.oe.pr], colnames(dta.pr))

# Number of shared terms.
length(idx.overlap.pr)

#################

# Restrict every open-ended matrix to the shared-term columns; each result is
# stored under the source name with an ".or" suffix (dt1a.fav -> dt1a.fav.or).
# NOTE(review): the same column index (computed from dt1a.fav) is applied to
# all eight matrices, which assumes they share one column layout -- confirm.
for (oe.name in c("dt1a.fav", "dt1a.unfav",
                  "dt2a.fav", "dt2a.unfav",
                  "dt3a.fav", "dt3a.unfav",
                  "dt4a.fav", "dt4a.unfav")) {
  assign(paste0(oe.name, ".or"), get(oe.name)[, idx.overlap.oe.pr])
}
rm(oe.name)

# Size of one restricted matrix, as a quick sanity check.
dim(dt3a.unfav.or)

#################

# Press-release matrix restricted to the shared terms.
PR.overlap <- dta.pr[, idx.overlap.pr]
# Should be TRUE: both subsets must list the same terms in the same order.
identical(colnames(dt3a.unfav.or), colnames(PR.overlap))

# Document metadata is encoded in the row names as " - "-separated fields.
# NOTE(review): fields look like month - day - year - name - <unused> - party
# file tag (e.g. "D.txt"); field 5 is never read -- confirm the naming scheme.
doc.list <- rownames(PR.overlap)
if (is.null(doc.list)) {
  # No row names: treat as zero documents instead of failing inside strsplit().
  doc.list <- character(0)
}
n.docs <- length(doc.list)

# Split every name once; fixed = TRUE because " - " is a literal separator.
doc.fields <- strsplit(doc.list, " - ", fixed = TRUE)

# Pull out one field per document; a name with too few fields yields NA,
# exactly as indexing past the end of the split did before.
month <- vapply(doc.fields, "[", character(1), 1L, USE.NAMES = FALSE)
day   <- vapply(doc.fields, "[", character(1), 2L, USE.NAMES = FALSE)
year  <- vapply(doc.fields, "[", character(1), 3L, USE.NAMES = FALSE)
name  <- vapply(doc.fields, "[", character(1), 4L, USE.NAMES = FALSE)
party <- vapply(doc.fields, "[", character(1), 6L, USE.NAMES = FALSE)


# Year flags; the year field shows up in several spellings.
yr.2009 <- year %in% c("09","2009")
yr.2010 <- year %in% c("010","2010","10")

# Party flags.  %in% never yields NA, so documents whose party field is
# missing are simply excluded.
is.dem <- party %in% "D.txt"
is.rep <- party %in% "R.txt"

# Period 1: Jul-Oct 2009 (stored as 0/1).
period1 <- 1 * (month %in% c("Jul","Aug","Sep","Oct") & yr.2009)

# Period 2: Nov-Dec 2009 together with Jan-Apr 2010 (stored as 0/1).
period2 <- 1 * ((month %in% c("Nov","Dec") & yr.2009) |
                (month %in% c("Jan","Feb","Mar","Apr") & yr.2010))

# Period 1 press releases by party, with the selected document names listed.
PR.D.dta.1 <- PR.overlap[is.dem & period1 == 1, ]
rownames(PR.D.dta.1)

PR.R.dta.1 <- PR.overlap[is.rep & period1 == 1, ]
rownames(PR.R.dta.1)

# Period 2 press releases by party.
PR.D.dta.2 <- PR.overlap[is.dem & period2 == 1, ]
rownames(PR.D.dta.2)

PR.R.dta.2 <- PR.overlap[is.rep & period2 == 1, ]
rownames(PR.R.dta.2)

#### merge OE data

# For each of the four open-ended data sets, stack the fav rows on top of the
# unfav rows to get one combined matrix over the shared terms.
# NOTE(review): relies on an rbind method being available for whatever class
# the dt*a.* objects have -- confirm.
dt1a.all.or <- rbind(dt1a.fav.or,dt1a.unfav.or)
dt2a.all.or <- rbind(dt2a.fav.or,dt2a.unfav.or)
dt3a.all.or <- rbind(dt3a.fav.or,dt3a.unfav.or)
dt4a.all.or <- rbind(dt4a.fav.or,dt4a.unfav.or)

####

# Per-term summaries for the four open-ended data sets.  The same computation
# was previously written out four times; it is factored into three helpers so
# the periods cannot drift apart, and the per-term SD is computed only once.

# 0/1 matrix: 1 where a row uses the term at least once.
oe.presence <- function(m) (as.matrix(m) > 0) * 1

# Share of rows in which each term occurs at least once.
oe.doc.share <- function(m) colSums(oe.presence(m)) / dim(m)[1]

# Per-term SD of a 0/1 matrix, divided by the mean SD across terms.
oe.scaled.sd <- function(ind) {
  term.sd <- apply(ind, 2, sd)
  term.sd / mean(term.sd)
}

# Naming used below, for data set k:
#   v.oe.k.d / .r / .a  term shares among fav rows / unfav rows / all rows
#   freqmatk            stacked 0/1 presence matrix (fav rows, then unfav rows)
#   freqk, freqksd      overall term share and its scaled SD
#   diffk               fav share minus unfav share

## data set 1
v.oe.1.d <- oe.doc.share(dt1a.fav.or)
v.oe.1.r <- oe.doc.share(dt1a.unfav.or)
v.oe.1.a <- oe.doc.share(dt1a.all.or)

freqmat1 <- rbind(oe.presence(dt1a.fav.or), oe.presence(dt1a.unfav.or))
freq1 <- colSums(freqmat1) / dim(freqmat1)[1]
freq1sd <- oe.scaled.sd(freqmat1)
diff1 <- v.oe.1.d - v.oe.1.r

## data set 2
v.oe.2.d <- oe.doc.share(dt2a.fav.or)
v.oe.2.r <- oe.doc.share(dt2a.unfav.or)
v.oe.2.a <- oe.doc.share(dt2a.all.or)

freqmat2 <- rbind(oe.presence(dt2a.fav.or), oe.presence(dt2a.unfav.or))
freq2 <- colSums(freqmat2) / dim(freqmat2)[1]
freq2sd <- oe.scaled.sd(freqmat2)
diff2 <- v.oe.2.d - v.oe.2.r

## data set 3
v.oe.3.d <- oe.doc.share(dt3a.fav.or)
v.oe.3.r <- oe.doc.share(dt3a.unfav.or)
v.oe.3.a <- oe.doc.share(dt3a.all.or)

freqmat3 <- rbind(oe.presence(dt3a.fav.or), oe.presence(dt3a.unfav.or))
freq3 <- colSums(freqmat3) / dim(freqmat3)[1]
freq3sd <- oe.scaled.sd(freqmat3)
diff3 <- v.oe.3.d - v.oe.3.r

## data set 4
v.oe.4.d <- oe.doc.share(dt4a.fav.or)
v.oe.4.r <- oe.doc.share(dt4a.unfav.or)
v.oe.4.a <- oe.doc.share(dt4a.all.or)

freqmat4 <- rbind(oe.presence(dt4a.fav.or), oe.presence(dt4a.unfav.or))
freq4 <- colSums(freqmat4) / dim(freqmat4)[1]
freq4sd <- oe.scaled.sd(freqmat4)
diff4 <- v.oe.4.d - v.oe.4.r

# Number of terms listed per column in the tables below.
nn <- 20

# LaTeX table: the nn terms with the most negative fav-minus-unfav difference,
# one column per open-ended data set.
xtable(cbind(names(sort(diff1)[1:nn]),
             names(sort(diff2)[1:nn]),
             names(sort(diff3)[1:nn]),
             names(sort(diff4)[1:nn])))

# LaTeX table: the nn terms with the most positive difference.
# TRUE is spelled out because T is an ordinary variable that can be
# overwritten (for instance by an object restored through load()).
xtable(cbind(names(sort(diff1, decreasing = TRUE)[1:nn]),
             names(sort(diff2, decreasing = TRUE)[1:nn]),
             names(sort(diff3, decreasing = TRUE)[1:nn]),
             names(sort(diff4, decreasing = TRUE)[1:nn])))


###########

# Share of documents in which each term occurs at least once.
pr.doc.share <- function(m) colSums((as.matrix(m) > 0) * 1) / dim(m)[1]

# Period 1: term shares in the "D.txt" and "R.txt" press releases and their
# difference (D minus R).
v.pr.1.d <- pr.doc.share(PR.D.dta.1)
v.pr.1.r <- pr.doc.share(PR.R.dta.1)

diffpr1 <- v.pr.1.d - v.pr.1.r

# Period 2: same quantities.
v.pr.2.d <- pr.doc.share(PR.D.dta.2)
v.pr.2.r <- pr.doc.share(PR.R.dta.2)

diffpr2 <- v.pr.2.d - v.pr.2.r

# LaTeX tables of the nn terms with the most negative / most positive
# difference in each period.  TRUE is spelled out because T can be reassigned.
xtable(cbind(names(sort(diffpr1)[1:nn]),
             names(sort(diffpr2)[1:nn])))
xtable(cbind(names(sort(diffpr1, decreasing = TRUE)[1:nn]),
             names(sort(diffpr2, decreasing = TRUE)[1:nn])))

# Alignment checks: the difference vectors must list the same terms in the
# same order, because the regression below pairs them element by element.
# These calls only report TRUE/FALSE; nothing stops the script on FALSE.
identical(names(diff2),names(diff1))
identical(names(diff2),names(diffpr1))

# One observation per term: change in the open-ended difference between data
# sets 2 and 3, regressed on the change in the press-release difference
# between periods 1 and 2.
lout <- lm(diff3-diff2 ~ I(diffpr2-diffpr1))
summary(lout)

# Common y-range over all four panels (difference divided by scaled SD).
# TRUE is spelled out because T is an ordinary, reassignable variable.
mn <- min(c(diff1/freq1sd, diff2/freq2sd, diff3/freq3sd, diff4/freq4sd), na.rm = TRUE)
mx <- max(c(diff1/freq1sd, diff2/freq2sd, diff3/freq3sd, diff4/freq4sd), na.rm = TRUE)

# 2x2 layout, filled column by column.
par(mfcol = c(2, 2))

# Label colour per term: "darkred" by default, "darkblue" where the
# difference is negative.
oe.term.colors <- function(d) {
  cols <- rep("darkred", length(d))
  cols[d < 0] <- "darkblue"
  cols
}

# Draw one panel: an empty frame (type = "n") on a log x-axis, then the term
# labels placed at (frequency, difference / SD).
#   freq   overall term shares; 0.002 is added so zero shares can be drawn
#          on the log axis
#   d, s   term differences and their scaled SDs
#   cols   one colour per term
#   title  panel title
#   ylim   shared y-range
oe.draw.panel <- function(freq, d, s, cols, title, ylim) {
  x.pos <- freq + 0.002
  y.pos <- d / s
  plot(x = x.pos, y = y.pos, log = "x", pch = 18, type = "n", ylim = ylim,
       ylab = "Difference/SD", xlab = "Frequency", main = title,
       cex.main = 2, cex.lab = 1.4)
  text(x = x.pos, y = y.pos, names(d), cex = .85, col = cols)
  invisible(NULL)
}

col1 <- oe.term.colors(diff1)
oe.draw.panel(freq1, diff1, freq1sd, col1, "July 2009", c(mn, mx))

col2 <- oe.term.colors(diff2)
oe.draw.panel(freq2, diff2, freq2sd, col2, "November 2009", c(mn, mx))

col3 <- oe.term.colors(diff3)
oe.draw.panel(freq3, diff3, freq3sd, col3, "May 2010", c(mn, mx))

col4 <- oe.term.colors(diff4)
oe.draw.panel(freq4, diff4, freq4sd, col4, "October 2010", c(mn, mx))

#dev.off()

